Prepare data for analysis

# load libraries
suppressPackageStartupMessages({
  library(scater)
  library(SingleCellExperiment)
})
## Warning: package 'SummarizedExperiment' was built under R version 3.6.2
## Warning: package 'S4Vectors' was built under R version 3.6.2
## Warning: package 'IRanges' was built under R version 3.6.2
## Warning: package 'DelayedArray' was built under R version 3.6.2
## Warning: package 'BiocParallel' was built under R version 3.6.2
# load qc'd data
sce <- readRDS("data/sceQC.rds")

Correct for technical differences between libraries (Normalize)

https://osca.bioconductor.org/normalization.html

Library size normalization

Divide all counts for each cell by a cell-specific size factor. Assumes that all genes within a cell are affected equally.

Library size is the total sum of counts across all genes for each cell. Library size factor is library size, normalized so that mean size factor across all cells is 1.

Get library size factors

libsize <- librarySizeFactors(sce)
summary(libsize)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##  0.004238  0.386809  0.653320  1.000000  1.445357 10.214326
hist(libsize)

hist(log10(libsize))

Note: This normalization technique assumes that each cell has a similar total number of transcripts, ie upregulation of one set of genes means balanced downregulation of a different set of genes. When this is not the case, you will get composition bias and innacurate normalization values. This may not be an issue for exploratory analysis because you will still get the same clusters, just with inaccurate distances between clusters. Options to correct for composition bias include normalization by convolution, and spike-in normalization.

Scale values

Scale by size factor, then log-transform values.

library(scran)
## Warning: package 'scran' was built under R version 3.6.2
set.seed(1000)
clusters <- quickCluster(sce)
table(clusters)
## clusters
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
##  803  122  147 1947  428  405  434  878  510  281  426  924  181  712  248  301 
##   17   18 
##  287  171
sce <- computeSumFactors(sce, cluster=clusters)
## Warning in FUN(...): encountered negative size factor estimates
sce <- logNormCounts(sce)
assayNames(sce)
## [1] "counts"    "logcounts"

Feature Selection

Select genes that contain biological information and remove genes that contain random noise

Quantifying per-gene variation

https://osca.bioconductor.org/feature-selection.html#quantifying-per-gene-variation

Option 1: By variance of expression across cells

fit a trend to the variance with respect to abundance across all genes. This trend represents the technical noise.

dec <- modelGeneVar(sce)
dec
## DataFrame with 29286 rows and 6 columns
##                              mean                total                 tech
##                         <numeric>            <numeric>            <numeric>
## A1BG          0.00131245178380456  0.00113470759936777  0.00173097527675417
## A1BG-AS1       0.0295184112917586   0.0446854224698095    0.038928882676635
## A1CF          0.00217581820793955  0.00646234816241916  0.00286965711777376
## A2M             0.186422484940104    0.365301778888915    0.244981524408179
## A2M-AS1        0.0556500680845831   0.0980280131825152   0.0733793013194764
## ...                           ...                  ...                  ...
## hsa-mir-7515 0.000314887234502944 0.000327095076897506 0.000415300659501429
## hsa-mir-8072    0.010563624069907   0.0170271419446693   0.0139321045309014
## snoR26       0.000564073951137366  0.00238711477296874 0.000743949750564854
## snoU2-30      0.00178432624538083  0.00304594899311063  0.00235332397457906
## snoZ40       0.000248721329966043 0.000569442471316782  0.00032803531353463
##                                bio              p.value                  FDR
##                          <numeric>            <numeric>            <numeric>
## A1BG         -0.000596267677386396    0.989975074786507    0.999999999779311
## A1BG-AS1       0.00575653979317453    0.159079031656266    0.486603571666508
## A1CF            0.0035926910446454 1.43650813404782e-17 2.21622854166644e-16
## A2M              0.120320254480736 0.000457319277195411  0.00284979117081208
## A2M-AS1         0.0246487118630388   0.0116761572338632   0.0548501251097815
## ...                            ...                  ...                  ...
## hsa-mir-7515 -8.82055826039235e-05    0.924182828828918    0.999999999779311
## hsa-mir-8072   0.00309503741376789   0.0668484776097457    0.246157897488013
## snoR26         0.00164316502240388 1.41166298624274e-50 5.90515127102321e-49
## snoU2-30      0.000692625018531564   0.0234692076399773    0.101257457996027
## snoZ40        0.000241407157782152 3.38279135696287e-07 2.99339861559819e-06
fit <- metadata(dec)
plot(fit$mean, fit$var, xlab="Mean of log-expression",
    ylab="Variance of log-expression")
curve(fit$trend(x), col="dodgerblue", add=TRUE, lwd=2)

dec[order(dec$bio, decreasing=TRUE),] 
## DataFrame with 29286 rows and 6 columns
##                     mean            total             tech                bio
##                <numeric>        <numeric>        <numeric>          <numeric>
## PRG4    3.10819917121007 8.33680384390871 2.37808423194192    5.9587196119668
## COL3A1  2.20123153207015 6.71405611825128 2.02497816843305   4.68907794981823
## COL1A2   2.2100474212612 6.16657849628065 2.02758869918712   4.13898979709352
## PLA2G2A 1.86553399507151 5.99557532270241 1.87931731593029   4.11625800677211
## CD74    3.77147833986318 6.80794871481592  2.6996303137858   4.10831840103012
## ...                  ...              ...              ...                ...
## RPL34   3.35028246456352 2.13102460889053 2.54104513460341 -0.410020525712887
## RPS6    3.50588094888864 2.18815796357584 2.60780307801807 -0.419645114442229
## RPL11   3.05049835525517 1.91141272087692 2.34772840964006 -0.436315688763137
## RPL23   2.74770929161865 1.76292907881772 2.22086443297396  -0.45793535415624
## GNAS    3.18199287465945 1.74240075628847 2.43178448404888 -0.689383727760414
##                      p.value                  FDR
##                    <numeric>            <numeric>
## PRG4    1.74001102020133e-64 9.75651440321398e-63
## COL3A1  2.20078162909224e-55 1.00946928236954e-53
## COL1A2  1.67024073776431e-43 5.86892227236964e-42
## PLA2G2A 9.01066742549993e-50 3.67384300981459e-48
## CD74     4.6488489634771e-25 9.60491626817727e-24
## ...                      ...                  ...
## RPL34       0.86198672598418    0.999999999779311
## RPS6       0.861330960451216    0.999999999779311
## RPL11      0.895186693054653    0.999999999779311
## RPL23      0.918035263590258    0.999999999779311
## GNAS       0.972174104937347    0.999999999779311

Option 2: by coefficient of variance squared (CV^2)

This gives us an error…….

Suggestion is to use variance of log counts

cv <- modelGeneCV2(sce)

Quantify technical noise

Separate biological noise from technical noise

You can use spike-ins but I don’t think we have spike-in data for some reason? The other option is to model gene variation based on a poisson distribution.

set.seed(0010101)
dec.pois <- modelGeneVarByPoisson(sce, block = sce$disease)
dec.pois[order(dec.pois$bio, decreasing=TRUE),]

Selecting highly-variable genes (HVG)

https://osca.bioconductor.org/feature-selection.html#hvg-selection

This gives us the top 10% of highly variable genes

chosenGenes <- getTopHVGs(dec, prop=0.1)
str(chosenGenes)
##  chr [1:1473] "PRG4" "COL3A1" "COL1A2" "PLA2G2A" "CD74" "IGHG4" "IGHG3" ...
sce.hvg <- sce[chosenGenes,]
sce.hvg
## class: SingleCellExperiment 
## dim: 1473 9205 
## metadata(0):
## assays(2): counts logcounts
## rownames(1473): PRG4 COL3A1 ... CKS2 PITPNC1
## rowData names(0):
## colnames: NULL
## colData names(36): cell_name barcode ... batch discard
## reducedDimNames(0):
## spikeNames(0):
## altExpNames(0):
saveRDS(sce.hvg, "data/sceHVG.rds")

Dimension reduction

sce <- readRDS("data/sceHVG.rds")
sce <- runPCA(sce)
reducedDims(sce)
## List of length 1
## names(1): PCA
dim(reducedDim(sce, "PCA"))
## [1] 9205   50

Determine the number of PCs to use for downstream analysis

Find elbow point to decide on the number of PCs to keep for downstream analysis

# By elbow point
percentVar <- attr(reducedDim(sce), "percentVar")
elbow <- PCAtools::findElbowPoint(percentVar)
elbow
## [1] 5
plot(percentVar, xlab="PC", ylab="Variance explained (%)")
abline(v = elbow, col="red")

# Using technical noise
set.seed(111001001)
denoised <- denoisePCA(sce, technical=dec)
ncol(reducedDim(denoised))
## [1] 5

Visualization

plotReducedDim(sce, dimred="PCA", colour_by="type")

plotReducedDim(sce, dimred="PCA", ncomponents=4,
    colour_by="type")

set.seed(00101001101)

# runTSNE() stores the t-SNE coordinates in the reducedDims
# for re-use across multiple plotReducedDim() calls.
sce <- runTSNE(sce, dimred="PCA")
plotReducedDim(sce, dimred="TSNE", colour_by="type")

set.seed(1100101001)
sce <- runUMAP(sce, dimred="PCA")
plotReducedDim(sce, dimred="UMAP", colour_by="type")

reducedDims(sce)
## List of length 3
## names(3): PCA TSNE UMAP
saveRDS(sce, "data/scePrepped.rds")

Color points by other factors

plotReducedDim(sce, dimred = "TSNE", colour_by="sample")

plotReducedDim(sce, dimred = "TSNE", colour_by="plate")

plotReducedDim(sce, dimred = "TSNE", colour_by="disease")

plotReducedDim(sce, dimred = "TSNE", colour_by="molecules")